import pandas as pd
import numpy as np
import itertools as it
import re
from tqdm.notebook import tqdm
import warnings; warnings.filterwarnings('ignore')
# visualization tools
import seaborn as sns
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
from wordcloud import WordCloud
import random
# nlp and ml tools
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# seaborn global style: larger "talk" context, white grid, visible axis ticks
sns.set_context("talk")
sns.set_style('whitegrid', {'xtick.bottom':True, 'ytick.left':True})
Data is scraped from Yelp and can be found in a separate notebook here. Up to 200 restaurants are selected for each of the top 25 counties in the US with the most deaths; per restaurant, up to 100 reviews are scraped.
# load the scraped review data; the unnamed first column is the saved index
df = pd.read_csv('review_data_2020_04_20_09_04.csv', index_col='Unnamed: 0')
df.head()
len(df)
# fix data types
df['publish_date'] = pd.to_datetime(df['publish_date'])
# the saved index identifies the restaurant; promote it to a real column
df = df.reset_index().rename(columns={'index':'restaurant_id'})
df.info()
# we know some reviews are duplicates because the restaurant falls into multiple cities
df[(df['author']=='Christopher V.') & (df['name']=='Cara Mia')]
# same location + same name + same review text -> keep one copy
df_clean = df.drop_duplicates(subset=['coordinates', 'name','description'])
# functions to run a bootstrap
# seed numpy's global RNG so bootstrap results are reproducible
np.random.seed(47)
def bootstrap_replicate_1d(data, func):
    """Resample `data` with replacement and return `func` of the resample."""
    resample = np.random.choice(data, size=len(data))
    return func(resample)
def draw_bs_reps(data, func, size=1):
    """Draw `size` bootstrap replicates of `func` over resamples of `data`."""
    # pre-allocate the output (float, matching the original np.empty fill)
    replicates = np.empty(size)
    for rep in range(size):
        # one replicate = func of a same-size resample with replacement
        replicates[rep] = bootstrap_replicate_1d(data, func)
    return replicates
def bootstrap(groupa, groupb):
    """Two-sample bootstrap comparison of means.

    Returns (mean_diff, conf_int, p_high, p_low): the observed
    mean(groupa) - mean(groupb), its 95% bootstrap CI, and one-sided
    p-values under the shared-mean null (small p_high: groupa significantly
    higher than groupb; small p_low: groupa significantly lower).
    """
    # observed difference in sample means
    mean_diff = np.mean(groupa) - np.mean(groupb)
    # bootstrap distribution of each group's mean (10,000 replicates)
    bs_replicates_a = draw_bs_reps(groupa, np.mean, size=10000)
    bs_replicates_b = draw_bs_reps(groupb, np.mean, size=10000)
    bs_diff_replicates = bs_replicates_a - bs_replicates_b
    # 95% percentile confidence interval for the difference of means
    conf_int = np.percentile(bs_diff_replicates, [2.5, 97.5])
    # Compute mean of combined data set: combined_mean
    combined_mean = np.mean(np.concatenate([groupa, groupb]))
    # Shift the samples so both share the combined mean (the null hypothesis)
    shifted_a = groupa - np.mean(groupa) + combined_mean
    shifted_b = groupb - np.mean(groupb) + combined_mean
    # Get bootstrap replicates of shifted data sets
    bs_replicates_a_shifted = draw_bs_reps(shifted_a, np.mean, 10000)
    bs_replicates_b_shifted = draw_bs_reps(shifted_b, np.mean, 10000)
    # Compute replicates of difference of means: bs_diff_replicates
    bs_diff_replicates_shifted = bs_replicates_a_shifted - bs_replicates_b_shifted
    # Compute the p-value for significantly higher values
    p_high = np.sum(bs_diff_replicates_shifted >= mean_diff) / len(bs_diff_replicates_shifted)
    # Compute the p-value for significantly lower values
    p_low = np.sum(bs_diff_replicates_shifted <= mean_diff) / len(bs_diff_replicates_shifted)
    return mean_diff, conf_int, p_high, p_low
def bootstrap_all(categorical_list, df_name, categorical_variable, test_variable):
#compare data for each variable in categorical_list to all other data
df_list = []
for i in tqdm(categorical_list):
others = df_name[df_name[categorical_variable]!=i].dropna(subset=[test_variable])
i_var = df_name[df_name[categorical_variable]==i].dropna(subset=[test_variable])
i_mean = np.mean(i_var[test_variable])
o_mean = np.mean(others[test_variable])
meandiff, conf_int, p_high, p_low = bootstrap(i_var[test_variable], others[test_variable])
df_list.append([i, i_mean, o_mean, meandiff, conf_int, p_high, p_low])
pval = pd.DataFrame(df_list)
pval.columns = ['Variable', 'Mean', 'Mean of Others', 'Mean Difference', '95% CI', 'p-value_high', 'p-value_low']
return pval
Here, I examine the geographical distribution of the restaurants used in this analysis.
# create subset dataset with only unique businesses, not all reviews
restaurant_df = df_clean[['restaurant_id', 'alias', 'categories', 'city_x', 'coordinates',
                          'is_closed', 'location','name', 'price', 'rating', 'review_count',
                          'transactions']].drop_duplicates()
# number of restaurants per location
restaurant_df['city_x'].value_counts()
# convert coordinates into unique columns for mapping
def coordinate_split(coordinates):
    """Parse a "{'latitude': X, 'longitude': Y}" string into (lat, lon) strings."""
    # splitting on ', ', ': ', or '}' leaves the numbers at positions 1 and 3
    parts = re.split(', |: |}', coordinates)
    lat, lon = parts[1], parts[3]
    return lat, lon
# split the coordinate string into separate latitude/longitude columns for mapping
restaurant_df[['latitude', 'longitude']] = restaurant_df['coordinates'].apply(coordinate_split).apply(pd.Series)
# hover text: county (restaurant name)
restaurant_df['text'] = restaurant_df['city_x'] + ' (' + restaurant_df['name'] + ')'
data = go.Scattergeo(lon = restaurant_df['longitude'],
                     lat = restaurant_df['latitude'],
                     text = restaurant_df['text'],
                     mode = 'markers',
                     )
# typo fixed in the displayed title: 'Higly' -> 'Highly'
layout = dict(title = 'Restaurants in Highly COVID-19-Affected Areas',
              geo_scope = 'usa')
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap)
# export an interactive copy alongside the notebook
choromap.write_html('plotly_figures/restaurant_distribution.html')
If there are issues viewing this figure, you can access interactive plotly .html files through the plotly_figures.zip folder in the repository.
These areas are highly localized to major US cities. Additionally, because the data was pulled by searching Yelp with just county name, restaurants included are those close to the center of the county.
Additionally, some coordinates are clearly wrong for specific restaurants - one Pizza Hut in Orleans is showing up as in Virginia, and The Mark in New Jersey is showing up as in Ohio.
# add a column to denote reviews before and after March 15, when restaurant closures began
# bug fix: the boolean mask must be built from df_clean, not df - df_clean is a
# deduplicated subset, so a mask from df is an unalignable boolean indexer
df_clean.loc[df_clean['publish_date']>='2020-03-15', 'Post-COVID Lockdown'] = 1
df_clean['Post-COVID Lockdown'] = df_clean['Post-COVID Lockdown'].fillna(0)
# drop rows without a description
df_clean = df_clean[df_clean['description'].notna()]
# we have a limited sample of reviews after lockdown - scraping a longer time-period may help
df_clean['Post-COVID Lockdown'].value_counts()
a. Data Preparation for Sentiment Analysis
b. Overall Sentiment
VADER Scores
VADER Scores v. Review Stars
Overall Sentiment by Region
c. Sentiment Pre- and Post-Lockdown
Review Stars
Compound VADER Scores
Positive VADER Scores
Negative VADER Scores
d. Sentiment Pre- and Post-Lockdown by Region
Review Stars by Region
Compound Scores by Region
Positive Scores by Region
Negative Scores by Region
e. Sentiment Analysis Conclusions
For this part of the sentiment analysis, I'll use NLTK's VADER toolkit, which uses lemmatized text, punctuation, and capitalization to assign a positive, negative, neutral, and compound score to each row. Here, I clean the data as needed for VADER input.
# clean data for sentiment analysis
# NOTE(review): the 'en' shorthand is deprecated in spaCy 3.x; the modern model
# name is 'en_core_web_sm' - confirm against the spaCy version in use
nlp = spacy.load('en')
# keep punctuation and caps because VADER uses these
# do get rid of stop words & lemmatize
# list of stop words
stop_words = spacy.lang.en.stop_words.STOP_WORDS
def sentiment_tokenizer(review):
    """Lemmatize a review and strip stop words, keeping punctuation/caps for VADER."""
    # newlines confuse the tokenizer; flatten them to single spaces
    doc = nlp(review.replace('\n', ' '))
    kept = []
    for token in doc:
        # drop whitespace tokens entirely
        if token.is_space:
            continue
        # spaCy lemmatizes pronouns to '-PRON-'; keep the original text instead
        word = token.text if token.lemma_ == '-PRON-' else token.lemma_
        # remove stop words after lemmatization
        if word not in stop_words:
            kept.append(word)
    # join tokens back into a sentence
    return ' '.join(kept)
# sanity-check the tokenizer on a single review
sentiment_tokenizer(df_clean.loc[0, 'description'])
# apply sentiment tokenizer to all reviews
df_clean['sentiment_review'] = df_clean['description'].apply(sentiment_tokenizer)
# add VADER scores as columns based on cleaned up text
sid = SentimentIntensityAnalyzer()
df_clean[['neg', 'neu', 'pos', 'compound']] = df_clean['sentiment_review'].apply(lambda x: sid.polarity_scores(x)).apply(pd.Series)
df_clean.head()
# because text data takes forever to clean, add checkpoints where data is exported to a csv
df_clean.to_csv('checkpoint1.csv')
Evaluate review sentiment across all regions and all times.
Return to Top: Sentiment Analysis
NLTK's VADER toolkit gives a positive, negative, and neutral score to text data, along with a score combining the three: a compound score. The package authors describe the compound score as follows:
The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.
-- Source
Positive: Scale of 0 to 1, neutral to positive.
Negative: Scale of 0 to 1, neutral to negative.
Compound: Scale of -1 to 1, negative to positive. Combines positive, negative, and neutral sentiments to create an overall sentiment metric.
# reload from the checkpoint so the expensive tokenization doesn't have to re-run
df_clean = pd.read_csv('checkpoint1.csv', index_col='Unnamed: 0')
df_clean.head()
Below, I've printed reviews with the highest and lowest compound scores to sanity check.
# show reviews with highest vader compound scores
best = df_clean[df_clean['compound'] == df_clean['compound'].max()]
for _, review in best.iterrows():
    print(f"\n---Restaurant Name: {review['name']}---")
    print(f"City: {review['city_x']}")
    print(f"Review Score: {review['score']}")
    print(f"Review Date: {review['publish_date']}\n")
    print(review['description'])
# show reviews with lowest vader compound scores
worst = df_clean[df_clean['compound'] == df_clean['compound'].min()]
for _, review in worst.iterrows():
    print(f"\n---Restaurant Name: {review['name']}---")
    print(f"City: {review['city_x']}")
    print(f"Review Score: {review['score']}")
    print(f"Review Date: {review['publish_date']}\n")
    print(review['description'])
Because Yelp reviews are accompanied by a 1-5 star ranking, we can compare compound, positive, and negative VADER scores to determine how they correlate with self-assigned rankings.
# plot distribution of compound scores by review score
fig, axes = plt.subplots(5, 1, figsize=(10, 15))
star_colors = ['tab:red', 'tab:orange', 'tab:olive', 'tab:green', 'tab:blue']
for idx, color in enumerate(star_colors):
    stars = idx + 1
    label = '1 Star' if stars == 1 else '{} Stars'.format(stars)
    sns.distplot(df_clean.loc[df_clean['score'] == stars, 'compound'],
                 hist=False, color=color, label=label, ax=axes[idx])
axes[0].set_title('Compound VADER Score by Review Stars')
# blank the x-label on all but the bottom panel; same x-range everywhere
for idx in range(4):
    axes[idx].set_xlabel('')
for idx in range(5):
    axes[idx].set_xlim(-1, 1)
axes[4].set_xlabel('Compound VADER Score')
plt.tight_layout()
Scores of 1 and 2 are more likely to have low compound VADER scores, as expected; scores of 3 or more have highly positive VADER scores.
# plot distribution of positive scores by review score
fig, axes = plt.subplots(5, 1, figsize=(10, 15))
star_colors = ['tab:red', 'tab:orange', 'tab:olive', 'tab:green', 'tab:blue']
for idx, color in enumerate(star_colors):
    stars = idx + 1
    label = '1 Star' if stars == 1 else '{} Stars'.format(stars)
    sns.distplot(df_clean.loc[df_clean['score'] == stars, 'pos'],
                 hist=False, color=color, label=label, ax=axes[idx])
axes[0].set_title('Positive VADER Score by Review Stars')
# blank the x-label on all but the bottom panel; same x-range everywhere
for idx in range(4):
    axes[idx].set_xlabel('')
axes[4].set_xlabel('Positive VADER Score')
for idx in range(5):
    axes[idx].set_xlim(0, 1)
plt.tight_layout()
There is a clear right shift as stars increase in the positive VADER score, as expected. Of note, there are peaks at 0: these are reviews that have no positive terms. My guess is that these are peaks due to sample size - one positive word dramatically affects the positive score in reviews that are relatively short.
# example reviews with no positively-scored terms at all (pos == 0)
df_clean.loc[df_clean['pos']==0, ['description', 'neg', 'neu', 'pos', 'compound']].head()
# plot distribution of negative scores by review score
fig, axes = plt.subplots(5, 1, figsize=(10, 15))
star_colors = ['tab:red', 'tab:orange', 'tab:olive', 'tab:green', 'tab:blue']
for idx, color in enumerate(star_colors):
    stars = idx + 1
    label = '1 Star' if stars == 1 else '{} Stars'.format(stars)
    sns.distplot(df_clean.loc[df_clean['score'] == stars, 'neg'],
                 hist=False, color=color, label=label, ax=axes[idx])
axes[0].set_title('Negative VADER Score by Review Stars')
# blank the x-label on all but the bottom panel; same x-range everywhere
for idx in range(4):
    axes[idx].set_xlabel('')
for idx in range(5):
    axes[idx].set_xlim(0, 1)
axes[4].set_xlabel('Negative VADER Score')
plt.tight_layout()
With negative scores, we see an increase in neutral sentiment for 2-4 stars, as expected. The highest negative scores (recall, higher negative scores are worse, not more positive) can be observed with lower stars.
Again, we see odd peaks at 0, and those can also be explained by reviews that contain no negative terms.
# example reviews with no negatively-scored terms at all (neg == 0)
df_clean.loc[df_clean['neg']==0, ['description', 'neg', 'neu', 'pos', 'compound']].head()
Generally, the VADER scores correlate with review stars, which confirms that sentiment can be determined using VADER.
Here, I evaluate overall sentiment by region, regardless of timeframe.
# bootstrap each county's compound score against all other counties
all_county_compound_bootstrap = bootstrap_all(df_clean['city_x'].unique(), df_clean, 'city_x', 'compound')
# counties with significantly higher compound scores overall
all_county_compound_bootstrap[all_county_compound_bootstrap['p-value_high']<0.05].sort_values('Mean', ascending=False)
# counties with significantly lower compound scores overall
all_county_compound_bootstrap[all_county_compound_bootstrap['p-value_low']<0.05].sort_values('Mean')
# mean compound score per restaurant, joined onto the restaurant metadata
compound_restaurant_plot = restaurant_df.merge(df_clean.groupby('restaurant_id').mean()['compound'], left_on='restaurant_id', right_index=True)
# hover text: county (restaurant name): mean compound score
compound_restaurant_plot['text'] = compound_restaurant_plot['city_x'] + ' (' + compound_restaurant_plot['name'] + '): ' + round(compound_restaurant_plot['compound'],2).astype(str)
# color each marker by its mean compound score
data = go.Scattergeo(lon = compound_restaurant_plot['longitude'],
                     lat = compound_restaurant_plot['latitude'],
                     text = compound_restaurant_plot['text'],
                     mode = 'markers',
                     marker_color = compound_restaurant_plot['compound'],
                     )
layout = dict(title = 'Distribution of Restaurants with Compound Score Variation Shown',
              geo_scope = 'usa')
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap)
# export an interactive copy alongside the notebook
choromap.write_html('plotly_figures/restaurant_distribution_compound.html')
While most restaurants have high compound scores of 0.7-1 (green), restaurants with lower mean scores are shown in pink/red. Some of the regions with significantly higher mean compound scores include Philadelphia, Pennsylvania; New York City, New York; and Los Angeles, California. Some of the regions with significantly lower compound scores include Orleans, Louisiana; King, Washington; and Nassau, New York.
# bootstrap each county's positive score against all other counties
all_county_pos_bootstrap = bootstrap_all(df_clean['city_x'].unique(), df_clean, 'city_x', 'pos')
# counties with significantly higher positive scores overall
all_county_pos_bootstrap[all_county_pos_bootstrap['p-value_high']<0.05].sort_values('Mean', ascending=False)
# mean positive score per restaurant, joined onto the restaurant metadata
pos_restaurant_plot = restaurant_df.merge(df_clean.groupby('restaurant_id').mean()['pos'], left_on='restaurant_id', right_index=True)
# hover text: county (restaurant name): mean positive score
pos_restaurant_plot['text'] = pos_restaurant_plot['city_x'] + ' (' + pos_restaurant_plot['name'] + '): ' + round(pos_restaurant_plot['pos'],2).astype(str)
data = go.Scattergeo(lon = pos_restaurant_plot['longitude'],
                     lat = pos_restaurant_plot['latitude'],
                     text = pos_restaurant_plot['text'],
                     mode = 'markers',
                     marker_color = pos_restaurant_plot['pos'],
                     )
layout = dict(title = 'Distribution of Restaurants with Positive Score Variation Shown',
              geo_scope = 'usa')
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap)
# export an interactive copy alongside the notebook
choromap.write_html('plotly_figures/restaurant_distribution_positive.html')
Positive sentiment by region tends to range, on average, from 0.32-0.4. Regions with significantly higher positive scores include Rockland, New York; Bergen, New Jersey; and Fairfield, Connecticut. Significantly lower positive scores are excluded since lower scores indicate neutrality, not negativity.
# bootstrap each county's negative score against all other counties
all_county_neg_bootstrap = bootstrap_all(df_clean['city_x'].unique(), df_clean, 'city_x', 'neg')
# counties with significantly higher negative scores overall
all_county_neg_bootstrap[all_county_neg_bootstrap['p-value_high']<0.05]
# mean negative score per restaurant, joined onto the restaurant metadata
neg_restaurant_plot = restaurant_df.merge(df_clean.groupby('restaurant_id').mean()['neg'], left_on='restaurant_id', right_index=True)
# hover text: county (restaurant name): mean negative score
neg_restaurant_plot['text'] = neg_restaurant_plot['city_x'] + ' (' + neg_restaurant_plot['name'] + '): ' + round(neg_restaurant_plot['neg'],2).astype(str)
data = go.Scattergeo(lon = neg_restaurant_plot['longitude'],
                     lat = neg_restaurant_plot['latitude'],
                     text = neg_restaurant_plot['text'],
                     mode = 'markers',
                     marker_color = neg_restaurant_plot['neg'],
                     )
layout = dict(title = 'Distribution of Restaurants with Negative Score Variation Shown',
              geo_scope = 'usa')
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap)
# export an interactive copy alongside the notebook
choromap.write_html('plotly_figures/restaurant_distribution_negative.html')
Negative sentiment by region tends to range, on average, from 0.04-0.08. Regions with significantly higher negative VADER scores include Middlesex, Massachusetts; Nassau, New York; and Suffolk, New York. Significantly lower negative scores are excluded since lower scores indicate neutrality, not positivity.
While there are trends by region, e.g., the overall most positive reviews occur in Philadelphia and New York and the most negative reviews occur in New Orleans and Seattle, zooming in on specific regions shows variation at the restaurant level, making it difficult to draw conclusions about trends in specific regions.
How have the distribution of review stars and compound, positive, and negative VADER scores changed before and after lockdown?
Here, I examine how review stars and VADER scores differ before and after lockdown, where lockdown is defined as March 15, 2020.
# create subset dfs based on covid lockdown
# (flag set earlier: 1 = published on/after 2020-03-15, 0 = before)
post_covid = df_clean[df_clean['Post-COVID Lockdown']==1]
pre_covid = df_clean[df_clean['Post-COVID Lockdown']==0]
Yelp reviews require the customer to assign a 1-5 star rating, 1 being the worst and 5 being the best.
# overlaid bar charts of the star-rating distribution, normalized to proportions
# so the (much smaller) post-lockdown sample is comparable to pre-lockdown
plt.figure(figsize=(8,5))
plt.bar(pre_covid['score'].value_counts(normalize=True).sort_index().index,
        pre_covid['score'].value_counts(normalize=True).sort_index(),
        alpha=0.5, color='palevioletred', width=1, label='Pre-Lockdown')
plt.bar(post_covid['score'].value_counts(normalize=True).sort_index().index,
        post_covid['score'].value_counts(normalize=True).sort_index(),
        alpha=0.5, color='mediumseagreen', width=1, label='Post-Lockdown')
plt.title('Distribution of Review Stars')
plt.xlabel('Review Stars')
plt.ylabel('Proportion of Reviews')
plt.legend()
plt.tight_layout()
We can see that the proportion of 5-star reviews has increased post-lockdown; however, the proportion of 1-star reviews appears to be about the same, and the proportion of neutral reviews has decreased.
# random sample of 5-star reviews post-lockdown
for _, review in post_covid[post_covid['score']==5].sample(5).iterrows():
    print(f"\n---Restaurant Name: {review['name']}---")
    print(f"City: {review['city_x']}")
    print(f"Review Stars: {review['score']}")
    print(f"Review Date: {review['publish_date']}\n")
    print(review['description'])
# sample of 1-star reviews post-lockdown
for _, review in post_covid[post_covid['score']==1].sample(5).iterrows():
    print(f"\n---Restaurant Name: {review['name']}---")
    print(f"City: {review['city_x']}")
    print(f"Review Stars: {review['score']}")
    print(f"Review Date: {review['publish_date']}\n")
    print(review['description'])
Here, I compare compound scores before and after lockdown.
plt.figure(figsize=(8,5))
# bug fix: `normed` was removed from plt.hist in matplotlib 3.x; `density=True`
# is the replacement and produces the same normalized histogram
plt.hist(pre_covid['compound'], alpha=0.5, color='palevioletred', bins=10, label='Pre-Lockdown', density=True)
plt.hist(post_covid['compound'], alpha=0.5, color='mediumseagreen', bins=10, label='Post-Lockdown', density=True)
plt.title('Distribution of Compound Scores')
plt.xlabel('Compound VADER Score')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
# plot distribution of compound scores by review stars pre- and post-lockdown
fig, axes = plt.subplots(5, 1, figsize=(10, 15))
for idx in range(5):
    stars = idx + 1
    suffix = 'Star' if stars == 1 else 'Stars'
    sns.distplot(pre_covid.loc[pre_covid['score'] == stars, 'compound'],
                 hist=False, norm_hist=True, color='palevioletred',
                 label='{} {} Pre-Lockdown'.format(stars, suffix), ax=axes[idx])
    sns.distplot(post_covid.loc[post_covid['score'] == stars, 'compound'],
                 hist=False, norm_hist=True, color='mediumseagreen',
                 label='{} {} Post-Lockdown'.format(stars, suffix), ax=axes[idx])
axes[0].set_title('Compound VADER Score Pre- and Post-Lockdown by Review Stars')
# blank per-panel x-labels and use the same x-range on every panel
for idx in range(5):
    axes[idx].set_xlabel('')
    axes[idx].set_xlim(-1, 1)
# invisible full-figure axes to carry the shared axis labels
fig.add_subplot(111, frame_on=False)
plt.tick_params(labelcolor="none", bottom=False, left=False)
plt.ylabel('Probability Density')
plt.xlabel('Compound VADER Score')
plt.grid(False)
plt.tight_layout()
# bug fix: the original chained assignment `compound_volume = data=...` also
# rebound the module-level `data` variable by accident; drop the stray `data=`
compound_volume = df_clean.groupby('Post-COVID Lockdown').count()['compound']
fig, axs = plt.subplots(2, figsize=(10,10), sharex=True)
# top: score distribution per period; bottom: review volume per period
sns.boxplot(x='Post-COVID Lockdown', y='compound', data=df_clean,
            palette=['palevioletred', 'mediumseagreen'], showfliers=False, ax=axs[0])
sns.barplot(x=compound_volume.index, y=compound_volume,
            palette=['palevioletred', 'mediumseagreen'], ax=axs[1], ec='k')
axs[0].set_title('Compound VADER Scores Pre- and Post-Lockdown')
axs[0].set_xlabel('')
axs[0].set_ylabel('Compound VADER Score')
axs[1].set_xlabel('')
axs[1].set_ylabel('Number of Reviews')
plt.xticks([0, 1], ['Pre-Lockdown', 'Post-Lockdown'])
plt.tight_layout()
# compare compound VADER score before and after lockdown
# bug fix: bootstrap() returns (mean_diff, conf_int, p_high, p_low), but the
# original unpacked the two p-values into swapped names; the printed conclusions
# happened to stay correct, but the code read wrongly. Small p_high means
# pre > post (i.e., the score decreased post-lockdown).
mean_diff, conf_int, p_high, p_low = bootstrap(pre_covid['compound'], post_covid['compound'])
if p_high < 0.05:
    print('The compound VADER score significantly decreased by a mean difference of {:.3f} in post-lockdown reviews; p={}'.format(np.abs(mean_diff), p_high))
elif p_low < 0.05:
    print('The compound VADER score significantly increased by a mean difference of {:.3f} in post-lockdown reviews; p={}'.format(np.abs(mean_diff), p_low))
else:
    print('No significant change in compound VADER score; p={}'.format(p_high))
The compound VADER score is significantly higher in post-lockdown reviews, indicating that sentiment has improved. To determine if this is due to an increase in positive sentiment or a decrease in negative sentiment, I evaluate the positive and negative VADER scores the same way.
Of note, the data are really imbalanced - there are substantially more reviews pre-lockdown. A future direction might be to select one random review pre-lockdown for every review post-lockdown for a given restaurant to create a more balanced dataset.
# highest compound scores post-lockdown
for _, review in post_covid[post_covid['compound']==post_covid['compound'].max()].iterrows():
    print(f"\n---Restaurant Name: {review['name']}---")
    print(f"City: {review['city_x']}")
    print(f"Review Score: {review['score']}")
    print(f"Review Date: {review['publish_date']}\n")
    print(review['description'])
# lowest compound scores post-lockdown
for _, review in post_covid[post_covid['compound']==post_covid['compound'].min()].iterrows():
    print(f"\n---Restaurant Name: {review['name']}---")
    print(f"City: {review['city_x']}")
    print(f"Review Score: {review['score']}")
    print(f"Review Date: {review['publish_date']}\n")
    print(review['description'])
Overall, there is a decrease in sentiment post-lockdown; however, it is difficult to interpret this, since there are so many fewer reviews available in the current data.
Here, I compare positive scores before and after lockdown.
# add artificial max to post-covid data so scale is the same
# .copy() so mutating the artificial row can't touch a view of post_covid
artificial_max = post_covid[post_covid['pos']==post_covid['pos'].max()].copy()
artificial_max['pos'] = 1
post_covid_plot = post_covid.append(artificial_max)
plt.figure(figsize=(8,5))
# bug fix: `normed` was removed from plt.hist in matplotlib 3.x; use `density`
plt.hist(pre_covid['pos'], alpha=0.5, color='palevioletred', bins=10, label='Pre-Lockdown', density=True)
plt.hist(post_covid_plot['pos'], alpha=0.5, color='mediumseagreen', bins=10, label='Post-Lockdown', density=True)
plt.title('Distribution of Positive Scores')
plt.xlabel('Positive VADER Score')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
# plot distribution of positive scores by review for pre- or post-lockdown
fig, axes = plt.subplots(5, 1, figsize=(10, 15))
for idx in range(5):
    stars = idx + 1
    suffix = 'Star' if stars == 1 else 'Stars'
    sns.distplot(pre_covid.loc[pre_covid['score'] == stars, 'pos'],
                 hist=False, norm_hist=True, color='palevioletred',
                 label='{} {} Pre-Lockdown'.format(stars, suffix), ax=axes[idx])
    sns.distplot(post_covid.loc[post_covid['score'] == stars, 'pos'],
                 hist=False, norm_hist=True, color='mediumseagreen',
                 label='{} {} Post-Lockdown'.format(stars, suffix), ax=axes[idx])
axes[0].set_title('Positive VADER Score Pre- and Post-Lockdown by Review Stars')
# blank per-panel x-labels and use the same x-range on every panel
for idx in range(5):
    axes[idx].set_xlabel('')
    axes[idx].set_xlim(0, 1)
# invisible full-figure axes to carry the shared axis labels
fig.add_subplot(111, frame_on=False)
plt.tick_params(labelcolor="none", bottom=False, left=False)
plt.ylabel('Probability Density')
plt.xlabel('Positive VADER Score')
plt.grid(False)
plt.tight_layout()
# bug fix: the original chained assignment `pos_volume = data=...` also rebound
# the module-level `data` variable by accident; drop the stray `data=`
pos_volume = df_clean.groupby('Post-COVID Lockdown').count()['pos']
fig, axs = plt.subplots(2, figsize=(10,10), sharex=True)
# top: score distribution per period; bottom: review volume per period
sns.boxplot(x='Post-COVID Lockdown', y='pos', data=df_clean,
            palette=['palevioletred', 'mediumseagreen'], showfliers=False, ax=axs[0])
sns.barplot(x=pos_volume.index, y=pos_volume,
            palette=['palevioletred', 'mediumseagreen'], ax=axs[1], ec='k')
axs[0].set_title('Positive VADER Scores Pre- and Post-Lockdown')
axs[0].set_xlabel('')
axs[0].set_ylabel('Positive VADER Score')
axs[1].set_xlabel('')
axs[1].set_ylabel('Number of Reviews')
plt.xticks([0, 1], ['Pre-Lockdown', 'Post-Lockdown'])
plt.tight_layout()
# compare positive VADER score before and after lockdown
# bug fix: bootstrap() returns (mean_diff, conf_int, p_high, p_low), but the
# original unpacked the two p-values into swapped names (output stayed correct).
# Small p_high means pre > post (score decreased post-lockdown).
mean_diff, conf_int, p_high, p_low = bootstrap(pre_covid['pos'], post_covid['pos'])
if p_high < 0.05:
    print('The positive VADER score significantly decreased by a mean difference of {:.3f} in post-lockdown reviews; p={}'.format(np.abs(mean_diff), p_high))
elif p_low < 0.05:
    print('The positive VADER score significantly increased by a mean difference of {:.3f} in post-lockdown reviews; p={}'.format(np.abs(mean_diff), p_low))
else:
    print('No significant change in positive VADER score; p={}'.format(p_high))
Positive scores behave highly similarly before and after lockdown. We see a slight right-shift in the distribution towards more positive scores, though.
Here, I compare negative scores before and after lockdown.
# add artificial max to pre- and post-covid data so scale is the same
# .copy() so mutating the artificial rows can't touch views of the originals
artificial_max_post = post_covid[post_covid['neg']==post_covid['neg'].max()].copy()
artificial_max_post['neg'] = 1
post_covid_plot = post_covid.append(artificial_max_post)
artificial_max_pre = pre_covid[pre_covid['neg']==pre_covid['neg'].max()].copy()
artificial_max_pre['neg'] = 1
# bug fix: the original built pre_covid_plot from post_covid, so the
# 'Pre-Lockdown' histogram actually showed post-lockdown data
pre_covid_plot = pre_covid.append(artificial_max_pre)
plt.figure(figsize=(8,5))
# bug fix: `normed` was removed from plt.hist in matplotlib 3.x; use `density`
plt.hist(pre_covid_plot['neg'], alpha=0.5, color='palevioletred', bins=10, label='Pre-Lockdown', density=True)
plt.hist(post_covid_plot['neg'], alpha=0.5, color='mediumseagreen', bins=10, label='Post-Lockdown', density=True)
plt.title('Distribution of Negative Scores')
plt.xlabel('Negative VADER Score')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
# plot distribution of negative scores by review for pre- or post-lockdown
fig, axes = plt.subplots(5, 1, figsize=(10, 15))
for idx in range(5):
    stars = idx + 1
    suffix = 'Star' if stars == 1 else 'Stars'
    sns.distplot(pre_covid.loc[pre_covid['score'] == stars, 'neg'],
                 hist=False, norm_hist=True, color='palevioletred',
                 label='{} {} Pre-Lockdown'.format(stars, suffix), ax=axes[idx])
    sns.distplot(post_covid.loc[post_covid['score'] == stars, 'neg'],
                 hist=False, norm_hist=True, color='mediumseagreen',
                 label='{} {} Post-Lockdown'.format(stars, suffix), ax=axes[idx])
axes[0].set_title('Negative VADER Score Pre- and post-Lockdown by Review Stars')
# blank per-panel x-labels and use the same x-range on every panel
for idx in range(5):
    axes[idx].set_xlabel('')
    axes[idx].set_xlim(0, 1)
# invisible full-figure axes to carry the shared axis labels
fig.add_subplot(111, frame_on=False)
plt.tick_params(labelcolor="none", bottom=False, left=False)
plt.ylabel('Probability Density')
plt.xlabel('Negative VADER Score')
plt.grid(False)
plt.tight_layout()
Some of the spikes at 0 are due to the same issues we saw previously: reviews with no terms deemed "negative". We know the data are imbalanced overall, with fewer negative and neutral reviews than positive reviews, which could explain some of these spikes, since the y-axis is normalized.
# review counts pre/post lockdown, for the volume bar chart
# BUG FIX: the original read `neg_volume = data=df_clean...`, a leftover `data=`
# keyword that chain-assigned a stray global named `data`
neg_volume = df_clean.groupby('Post-COVID Lockdown').count()['neg']
fig, axs = plt.subplots(2, figsize=(10, 10), sharex=True)
# top: score distribution; bottom: number of reviews in each period
sns.boxplot(x='Post-COVID Lockdown', y='neg', data=df_clean,
            palette=['palevioletred', 'mediumseagreen'], showfliers=False, ax=axs[0])
sns.barplot(x=neg_volume.index, y=neg_volume,
            palette=['palevioletred', 'mediumseagreen'], ax=axs[1], ec='k')
axs[0].set_title('Negative VADER Scores Pre- and Post-Lockdown')
axs[0].set_xlabel('')
axs[0].set_ylabel('Negative VADER Score')
axs[1].set_xlabel('')
axs[1].set_ylabel('Number of Reviews')
plt.xticks([0, 1], ['Pre-Lockdown', 'Post-Lockdown'])
plt.tight_layout()
# bootstrap test: did the mean negative score change after lockdown?
mean_diff, conf_int, p_low, p_high = bootstrap(pre_covid['neg'], post_covid['neg'])
if p_low < 0.05:
    # pre > post: negative sentiment dropped
    template = 'The negative VADER score significantly decreased by a mean difference of {:.3f} in post-lockdown reviews; p={}'
    print(template.format(np.abs(mean_diff), p_low))
elif p_high < 0.05:
    # pre < post: negative sentiment rose
    template = 'The negative VADER score significantly increased by a mean difference of {:.3f} in post-lockdown reviews; p={}'
    print(template.format(np.abs(mean_diff), p_high))
else:
    print('No significant change in negative VADER score; p={}'.format(p_low))
The patterns for negative scores again seem similar between the two time periods. The overall distribution of negative scores looks almost identical between the two time periods.
Overall, restaurant sentiment has increased post-lockdown due to a decrease in negative reviews; however, more data post-lockdown would really help solidify this.
Evaluate sentiment before and after lockdown for each county.
# distribution of review stars per city, pre- vs post-lockdown (13x2 grid)
fig, axes = plt.subplots(13, 2, figsize=(10, 50))
# add one review with each star to each city before and after lockdown
# (smoothing so every star level shows a bar in every city)
# FIX: the original duplicated this comment line
add_city = []
add_star = []
for city in df_clean['city_x'].unique():
    for star in range(1, 6):
        add_city.append(city)
        add_star.append(star)
add_stars = pd.DataFrame({'city_x': add_city, 'score': add_star})
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the drop-in equivalent
post_stars = pd.concat([post_covid, add_stars])
pre_stars = pd.concat([pre_covid, add_stars])
for idx, city in enumerate(df_clean['city_x'].unique()):
    # lay cities out left-to-right, top-to-bottom; divmod replaces the
    # original even/odd branching
    loc_x, loc_y = divmod(idx, 2)
    sns.barplot(x=post_stars.loc[post_stars['city_x']==city, 'score'].value_counts(normalize=True).sort_index().index,
                y=post_stars.loc[post_stars['city_x']==city, 'score'].value_counts(normalize=True).sort_index(),
                alpha=0.5, color='mediumseagreen', label='Post-Lockdown',
                ax=axes[loc_x, loc_y])
    sns.barplot(x=pre_stars.loc[pre_stars['city_x']==city, 'score'].value_counts(normalize=True).sort_index().index,
                y=pre_stars.loc[pre_stars['city_x']==city, 'score'].value_counts(normalize=True).sort_index(),
                alpha=0.5, color='palevioletred', label='Pre-Lockdown',
                ax=axes[loc_x, loc_y])
    axes[loc_x, loc_y].legend()
    axes[loc_x, loc_y].set_title(city)
    axes[loc_x, loc_y].set_ylabel('Frequency')
    axes[loc_x, loc_y].set_xlabel('Review Stars')
plt.tight_layout()
In many cities, the proportion of 5-star reviews seems to increase and the proportion of 1-star reviews increases, indicating a decrease in neutral reviews. One city of note is Seattle, where the proportion of 1-star reviews decreased and the proportion of 5-star reviews dramatically increased, suggesting improved sentiment in the PNW.
Here, I compare compound VADER scores pre- and post-lockdown by county.
# compound VADER score distributions (top) and review volume (bottom) per county
compound_vol_county = df_clean.groupby(['city_x', 'Post-COVID Lockdown']).count()['compound'].reset_index()
fig, axs = plt.subplots(2, figsize=(20, 15), sharex=True)
sns.boxplot(x='city_x', y='compound', hue='Post-COVID Lockdown', data=df_clean,
            palette=['palevioletred', 'mediumseagreen'], showfliers=False, ax=axs[0])
sns.barplot(x='city_x', y='compound', hue='Post-COVID Lockdown', data=compound_vol_county,
            palette=['palevioletred', 'mediumseagreen'], ax=axs[1], ec='k')
axs[0].set_title('Compound VADER Scores Pre- and Post-Lockdown by Region', size=25)
axs[0].set_ylabel('Compound VADER Score')
axs[1].set_ylabel('Number of Reviews')
# blank per-panel x labels; the shared city ticks carry the information
for panel in axs:
    panel.set_xlabel('')
axs[0].get_legend().remove()
plt.legend(title='Before (0) or After (1) Lockdown')
plt.xticks(rotation=90)
plt.tight_layout()
# compare compound scores for each county pre- and post-lockdown
compound_county = []
for city in tqdm(df_clean['city_x'].unique()):
    in_city = df_clean['city_x'] == city
    pre_scores = df_clean.loc[in_city & (df_clean['Post-COVID Lockdown'] == 0), 'compound']
    post_scores = df_clean.loc[in_city & (df_clean['Post-COVID Lockdown'] == 1), 'compound']
    mean_diff, conf_int, p_low, p_high = bootstrap(pre_scores, post_scores)
    print('------')
    print('City: {}'.format(city))
    if p_low < 0.05:
        # significant drop: remember the city and report it
        compound_county.append(city)
        print('The compound VADER score significantly decreased after lockdown; mean difference=-{:.2f}, p={}'.format(np.abs(mean_diff), p_low))
    elif p_high < 0.05:
        # significant rise: remember the city and report it
        compound_county.append(city)
        print('The compound VADER score significantly increased after lockdown; mean difference={:.2f}, p={}'.format(np.abs(mean_diff), p_high))
    else:
        print('No significant difference in compound VADER score; p={}'.format(p_low))
# reviews with lowest compound scores in regions with sig. differences
for city in compound_county:
    city_reviews = post_covid[post_covid['city_x'] == city]
    worst = city_reviews['compound'].min()
    # there may be ties at the minimum, so print every matching review
    for _, review in city_reviews[city_reviews['compound'] == worst].iterrows():
        print('\n---City: {}---'.format(review['city_x']))
        print('Restaurant Name: {}'.format(review['name']))
        print('Review Score: {}'.format(review['score']))
        print('Review Date: {}\n'.format(review['publish_date']))
        print(review['description'])
# flag restaurants in regions whose compound score changed significantly
restaurant_df.loc[restaurant_df['city_x'].isin(compound_county), 'Compound Affected'] = 1
restaurant_df['Compound Affected'] = restaurant_df['Compound Affected'].fillna(0)
# US map; purple markers mark the significantly affected regions
affected_trace = go.Scattergeo(
    lon=restaurant_df['longitude'],
    lat=restaurant_df['latitude'],
    text=restaurant_df['city_x'],
    mode='markers',
    marker=dict(colorscale=['cornflowerblue', 'mediumorchid'],
                color=restaurant_df['Compound Affected']),
)
map_layout = dict(title='Regions with Significantly Affected Compound Scores Post-Lockdown (Purple)',
                  geo_scope='usa')
choromap = go.Figure(data=[affected_trace], layout=map_layout)
iplot(choromap)
choromap.write_html('plotly_figures/restaurant_distribution_significant_compound.html')
There isn't a clear geographic trend of regions with significantly different compound scores post-lockdown; however, many regions do have significant differences, possibly indicating some have handled the transition in business strategy better than others. We see significant decreases in sentiment in Philadelphia; Hartford; Jefferson, LA; Essex, NJ; NYC; LA; Bergen, NJ; and Cook, IL (Chicago).
# positive VADER score distributions (top) and review volume (bottom) per county
pos_vol_county = df_clean.groupby(['city_x', 'Post-COVID Lockdown']).count()['pos'].reset_index()
fig, axs = plt.subplots(2, figsize=(20, 15), sharex=True)
sns.boxplot(x='city_x', y='pos', hue='Post-COVID Lockdown', data=df_clean,
            palette=['palevioletred', 'mediumseagreen'], showfliers=False, ax=axs[0])
sns.barplot(x='city_x', y='pos', hue='Post-COVID Lockdown', data=pos_vol_county,
            palette=['palevioletred', 'mediumseagreen'], ax=axs[1], ec='k')
axs[0].set_title('Positive VADER Scores Pre- and Post-Lockdown by Region')
axs[0].set_ylabel('Positive VADER Score')
axs[1].set_ylabel('Number of Reviews')
# blank per-panel x labels; the shared city ticks carry the information
for panel in axs:
    panel.set_xlabel('')
axs[0].get_legend().remove()
plt.legend(title='Before (0) or After (1) Lockdown')
plt.xticks(rotation=90)
plt.tight_layout()
# compare positive scores for each county pre- and post-lockdown
pos_county = []
for city in tqdm(df_clean['city_x'].unique()):
    in_city = df_clean['city_x'] == city
    pre_scores = df_clean.loc[in_city & (df_clean['Post-COVID Lockdown'] == 0), 'pos']
    post_scores = df_clean.loc[in_city & (df_clean['Post-COVID Lockdown'] == 1), 'pos']
    mean_diff, conf_int, p_low, p_high = bootstrap(pre_scores, post_scores)
    print('------')
    print('City: {}'.format(city))
    if p_low < 0.05:
        # significant drop: remember the city and report it
        pos_county.append(city)
        print('The positive VADER score significantly decreased after lockdown; mean difference=-{:.2f}, p={}'.format(np.abs(mean_diff), p_low))
    elif p_high < 0.05:
        # significant rise: remember the city and report it
        pos_county.append(city)
        print('The positive VADER score significantly increased after lockdown; mean difference={:.2f}, p={}'.format(np.abs(mean_diff), p_high))
    else:
        print('No significant difference in positive VADER score; p={}'.format(p_low))
# reviews with lowest positive scores in regions with sig. differences
for city in pos_county:
    city_reviews = post_covid[post_covid['city_x'] == city]
    floor_score = city_reviews['pos'].min()
    # ties at the minimum are all printed
    for _, review in city_reviews[city_reviews['pos'] == floor_score].iterrows():
        print('\n---City: {}---'.format(review['city_x']))
        print('Restaurant Name: {}'.format(review['name']))
        print('Review Score: {}'.format(review['score']))
        print('Review Date: {}\n'.format(review['publish_date']))
        print(review['description'])
# reviews with highest positive scores in regions with sig. differences
for city in pos_county:
    city_reviews = post_covid[post_covid['city_x'] == city]
    ceiling_score = city_reviews['pos'].max()
    # ties at the maximum are all printed
    for _, review in city_reviews[city_reviews['pos'] == ceiling_score].iterrows():
        print('\n---City: {}---'.format(review['city_x']))
        print('Restaurant Name: {}'.format(review['name']))
        print('Review Score: {}'.format(review['score']))
        print('Review Date: {}\n'.format(review['publish_date']))
        print(review['description'])
# flag restaurants in regions whose positive score changed significantly
restaurant_df.loc[restaurant_df['city_x'].isin(pos_county), 'Positive Affected'] = 1
restaurant_df['Positive Affected'] = restaurant_df['Positive Affected'].fillna(0)
# US map; purple markers mark the significantly affected regions
affected_trace = go.Scattergeo(
    lon=restaurant_df['longitude'],
    lat=restaurant_df['latitude'],
    text=restaurant_df['city_x'],
    mode='markers',
    marker=dict(colorscale=['cornflowerblue', 'mediumorchid'],
                color=restaurant_df['Positive Affected']),
)
map_layout = dict(title='Regions with Significantly Affected Positive Scores Post-Lockdown (Purple)',
                  geo_scope='usa')
choromap = go.Figure(data=[affected_trace], layout=map_layout)
iplot(choromap)
choromap.write_html('plotly_figures/restaurant_distribution_significant_positive.html')
Again, there isn't a clear geographic trend of regions with significantly different positive scores post-lockdown. The only region with a significantly decreased positive score is New Orleans; LA and Seattle both showed a significant increase in positive scores.
Here, I examine negative VADER scores pre- and post-lockdown by region.
# negative VADER score distributions (top) and review volume (bottom) per county
neg_vol_county = df_clean.groupby(['city_x', 'Post-COVID Lockdown']).count()['neg'].reset_index()
fig, axs = plt.subplots(2, figsize=(20, 15), sharex=True)
sns.boxplot(x='city_x', y='neg', hue='Post-COVID Lockdown', data=df_clean,
            palette=['palevioletred', 'mediumseagreen'], showfliers=False, ax=axs[0])
sns.barplot(x='city_x', y='neg', hue='Post-COVID Lockdown', data=neg_vol_county,
            palette=['palevioletred', 'mediumseagreen'], ax=axs[1], ec='k')
axs[0].set_title('Negative VADER Scores Pre- and Post-Lockdown by Region')
axs[0].set_ylabel('Negative VADER Score')
axs[1].set_ylabel('Number of Reviews')
# blank per-panel x labels; the shared city ticks carry the information
for panel in axs:
    panel.set_xlabel('')
axs[0].get_legend().remove()
plt.legend(title='Before (0) or After (1) Lockdown')
plt.xticks(rotation=90)
plt.tight_layout()
# compare negative scores for each county pre- and post-lockdown
neg_county = []
for city in tqdm(df_clean['city_x'].unique()):
    in_city = df_clean['city_x'] == city
    pre_scores = df_clean.loc[in_city & (df_clean['Post-COVID Lockdown'] == 0), 'neg']
    post_scores = df_clean.loc[in_city & (df_clean['Post-COVID Lockdown'] == 1), 'neg']
    mean_diff, conf_int, p_low, p_high = bootstrap(pre_scores, post_scores)
    print('------')
    print('City: {}'.format(city))
    if p_low < 0.05:
        # significant drop: remember the city and report it
        neg_county.append(city)
        print('The negative VADER score significantly decreased after lockdown; mean difference=-{:.2f}, p={}'.format(np.abs(mean_diff), p_low))
    elif p_high < 0.05:
        # significant rise: remember the city and report it
        neg_county.append(city)
        print('The negative VADER score significantly increased after lockdown; mean difference={:.2f}, p={}'.format(np.abs(mean_diff), p_high))
    else:
        print('No significant difference in negative VADER score; p={}'.format(p_low))
# reviews with highest negative scores in regions with sig. differences
for city in neg_county:
    # FIX (naming): this is the *maximum* negative score — the original named
    # it `lowest_score`, contradicting both the comment and the .max() call
    highest_score = post_covid.loc[post_covid['city_x']==city, 'neg'].max()
    for idx, row in post_covid[(post_covid['city_x']==city) & (post_covid['neg']==highest_score)].iterrows():
        print('\n---City: {}---'.format(row['city_x']))
        print('Restaurant Name: {}'.format(row['name']))
        print('Review Score: {}'.format(row['score']))
        print('Review Date: {}\n'.format(row['publish_date']))
        print(row['description'])
# flag restaurants in regions whose negative score changed significantly
restaurant_df.loc[restaurant_df['city_x'].isin(neg_county), 'Negative Affected'] = 1
restaurant_df['Negative Affected'] = restaurant_df['Negative Affected'].fillna(0)
# US map; purple markers mark the significantly affected regions
affected_trace = go.Scattergeo(
    lon=restaurant_df['longitude'],
    lat=restaurant_df['latitude'],
    text=restaurant_df['city_x'],
    mode='markers',
    marker=dict(colorscale=['cornflowerblue', 'mediumorchid'],
                color=restaurant_df['Negative Affected']),
)
map_layout = dict(title='Regions with Significantly Affected Negative Scores Post-Lockdown (Purple)',
                  geo_scope='usa')
choromap = go.Figure(data=[affected_trace], layout=map_layout)
iplot(choromap)
choromap.write_html('plotly_figures/restaurant_distribution_significant_negative.html')
A number of counties experience a significant increase in negative scores (i.e., worse sentiment), including Hartford; Jefferson, LA; Nassau, NJ; Suffolk, NY; NYC; LA; and Cook, IL.
While it's difficult to draw broad conclusions with a limited sample size, in general, there is a decrease in sentiment post-lockdown.
Sentiment is variable on a by-restaurant basis, and there are no clear overall geographical trends. This could be studied further with more restaurant data.
Examining all reviews together, there appears to be a decrease in sentiment post-lockdown due to an increase in negative reviews; however, the sample size post-lockdown is very small. At the time of scraping, only a month of reviews were available; a repeat analysis with more recent data would be more informative.
Regionally, while some counties don't have clear trends, many also show a decrease in sentiment, also due to an increase in negative reviews. Highly coronavirus-affected areas such as New York City, LA, and adjacent counties are included in this list.
a. Additional Data Cleaning
b. TF-IDF Modeling
c. KMeans Clustering
d. PCA
e. Topic Modeling
f. Coronavirus-Like Review Sentiment
Sentiment of the Coronavirus Cluster
Comparison of Sentiment by Cluster
Sample Reviews for the Coronavirus-Like Cluster
Geographical Distribution of Coronavirus-Like Reviews
g. Modeling Conclusions
VADER requires some text components to remain intact, such as capitalization of words and punctuation, since these can influence sentiment. However, for the remainder of the analysis, we need to further clean the review text.
nlp = spacy.load('en')
def sentiment_tokenizer_complete(review):
    """Clean a review for vectorization.

    Lemmatizes and lowercases the text, and removes whitespace, punctuation,
    numeric tokens, stop words, and tokens of one character or less.

    Parameters
    ----------
    review : str
        Raw review text; may contain newlines.

    Returns
    -------
    str
        The cleaned tokens joined back into a single space-separated string.
    """
    # newlines would otherwise survive as whitespace tokens
    doc = nlp(review.replace('\n', ' '))
    tokens = []
    for word in doc:
        # drop whitespace, punctuation, and numbers in a single pass
        # (idiomatic `not x` instead of `x == False`)
        if word.is_space or word.is_punct or word.pos_ == 'NUM':
            continue
        # spaCy v2 lemmatizes pronouns to the placeholder '-PRON-';
        # keep the lowercased surface form for those instead
        lemma = word.lower_ if word.lemma_ == '-PRON-' else word.lemma_.lower()
        # drop stop words (checked on the lemma, as before) and 1-char tokens
        if lemma in STOP_WORDS or len(lemma) <= 1:
            continue
        tokens.append(lemma)
    return ' '.join(tokens)
# tokenize every review (slow; %time reports the wall time in the notebook)
%time df_clean['clean_review'] = df_clean['description'].apply(sentiment_tokenizer_complete)
# checkpoint in case the kernel dies
df_clean.to_csv('checkpoint2.csv', index=False)
# reload the checkpoint in case the kernel dies (re-run from here)
df_clean = pd.read_csv('checkpoint2.csv')
Here, I use TF-IDF to vectorize the reviews for further analyses.
For TF-IDF modeling, I've limited the sample size to reviews only since March 1, 2020. All data ends up using >12GB of memory and causing the kernel on my computer to crash; one way around this would be running the clustering on a remote server.
# shape prior to reducing size
df_clean.shape
# only use reviews since march 1 to reduce memory footprint
df_clean['publish_date'] = pd.to_datetime(df_clean['publish_date'])
df_short = df_clean[df_clean['publish_date'] >= '2020-03-01']
df_short.shape
# add tf-idf columns; terms in fewer than 10 reviews are ignored
tfidf = TfidfVectorizer(min_df=10)
tfidf_result = tfidf.fit_transform(df_short['clean_review']).toarray()
# FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() is the replacement
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names_out())
# prefix term columns so they can't collide with review metadata columns later
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
# keep row alignment with df_short for the index-based merge later on
tfidf_df.index = df_short.index
tfidf_df.shape
Using the elbow method, I test different values of k to determine how many clusters should be used with the TF-IDF data.
# elbow method: fit KMeans for each candidate k and record the inertia
ks = range(2, 35)
inertias = [KMeans(n_clusters=k).fit(tfidf_df).inertia_ for k in ks]
plt.figure(figsize=(15, 5))
plt.plot(ks, inertias, '-o')
plt.xlabel('Number of clusters, k')
plt.ylabel('Inertia')
plt.xticks(ks)
plt.show()
There isn't a clear elbow, although at 13 there is a slight plateau.
# model to assign values - going with 13 clusters
kmeans = KMeans(n_clusters=13)
y_pred = kmeans.fit_predict(tfidf_df)
# add cluster assignment to the tfidf dataframe
tfidf_df['cluster'] = y_pred
# FIX: the original plotted value_counts(sort=False) against fixed positions
# 1-13, so the bars were not aligned with their actual cluster ids; sorting by
# cluster id and labeling with the ids (0-12) keeps bars and labels in sync
cluster_counts = tfidf_df['cluster'].value_counts().sort_index()
plt.figure(figsize=(15, 5))
plt.bar(cluster_counts.index, cluster_counts, ec='k')
plt.xticks(cluster_counts.index)
plt.xlabel('Cluster')
plt.ylabel('Number of Reviews')
Most reviews are in the 13th cluster. Next, I've performed a PCA to visualize the clusters.
I use a PCA to reduce the dimensions of the TF-IDF data and visualize the clusters on a 2-D plot.
# pca to visualize clusters in two dimensions
pca = PCA(n_components=2)
# drop the cluster label so only the term vectors are projected
pca_features = pca.fit_transform(tfidf_df.drop(columns='cluster'))
# FIX: .copy() — the original assigned new columns to a slice of tfidf_df,
# which triggers pandas' SettingWithCopyWarning and is ambiguous about whether
# the parent frame is modified
df_pca = tfidf_df[['cluster']].copy()
df_pca['x'] = pca_features[:, 0]
df_pca['y'] = pca_features[:, 1]
# scatter plot of the first and second principal components, one color per cluster
plt.figure(figsize=(10, 10))
for cluster in range(13):
    in_cluster = df_pca['cluster'] == cluster
    plt.scatter(df_pca.loc[in_cluster, 'x'], df_pca.loc[in_cluster, 'y'],
                label=cluster, alpha=0.5)
plt.legend()
plt.title('PCA of TF-IDF')
plt.show()
import plotly.graph_objects as go
import numpy as np
# interactive version of the PCA scatter, one trace per cluster
fig = go.Figure()
for cluster in range(13):
    subset = df_pca[df_pca['cluster'] == cluster]
    # FIX: plotly trace names must be strings; the original passed the int
    fig.add_trace(go.Scatter(x=subset['x'],
                             y=subset['y'],
                             name=str(cluster),
                             mode='markers'))
# Set options common to all traces with fig.update_traces
fig.update_traces(mode='markers', marker_line_width=2, marker_size=10)
fig.update_layout(title='Interactive PCA',
                  yaxis_zeroline=False, xaxis_zeroline=False)
fig.write_html('plotly_figures/interactive_pca.html')
fig.show()
Based on these visualizations, we can see how the clusters might be separated, although there appears to be overlap. This makes sense: similar words, such as "restaurant" or "service", are likely used across all restaurant reviews and could appear in many clusters.
# mean TF-IDF value of every term within each cluster
tfidf_means = tfidf_df.groupby('cluster').mean()
top_feats = {}
for cluster in range(13):
    # five terms with the largest mean value in this cluster
    ranked = tfidf_means.iloc[cluster].sort_values(ascending=False)
    top_feats[cluster] = ranked.head(5).index.to_list()
    print('Cluster: {}'.format(cluster))
    # strip the 'word_' prefix added when the TF-IDF columns were built
    print([feat.split('word_')[1] for feat in top_feats[cluster]])
    print('---')
Here, I use the clusters to identify terms and sentiment of terms associated with COVID-19.
# see which clusters have the largest mean values for coronavirus words
coronavirus_words = ['covid', 'coronavirus', 'corona', 'covid-19', 'virus', 'pandemic', 'quarantine']
for word in coronavirus_words:
    col = 'word_{}'.format(word)
    if col in tfidf_means.columns:
        # idxmax returns the (first) cluster id with the maximum mean directly,
        # replacing the original max-then-boolean-filter round trip
        print('"{}": Cluster {}'.format(word.capitalize(), tfidf_means[col].idxmax()))
# see which clusters have the largest values for random words as a sanity check
not_coronavirus_words = ['mexican', 'pasta', 'sushi', 'takeout', 'service', 'chicken', 'ambiance']
for word in not_coronavirus_words:
    col = 'word_{}'.format(word)
    if col in tfidf_means.columns:
        # idxmax returns the (first) cluster id with the maximum mean directly,
        # replacing the original max-then-boolean-filter round trip
        print('"{}": Cluster {}'.format(word.capitalize(), tfidf_means[col].idxmax()))
Coronavirus terms appear to cluster together (the cluster number changes each time the KMeans is run, so I've printed it below). For a sanity check, I included some random terms as well in a second check to see where they might cluster. Next, I'll investigate what other words are associated with the coronavirus cluster.
# identify the coronavirus cluster and show its top associated terms
if 'word_coronavirus' in tfidf_means.columns:
    # idxmax gives the cluster id with the highest mean TF-IDF for the term,
    # replacing the original max-then-boolean-filter round trip
    corona_cluster = tfidf_means['word_coronavirus'].idxmax()
    print('Coronavirus cluster: {}'.format(corona_cluster))
print('Top terms associated with COVID-19-like Terms:')
print([feat.split('word_')[1] for feat in tfidf_means.iloc[corona_cluster].sort_values(ascending=False).head(50).index.to_list()])
## examine reviews from the corona cluster
# join tfidf to df_short
# index-aligned merge: tfidf_df was given df_short's index earlier, so this
# pairs each review row with its TF-IDF vector and cluster assignment
df_join = df_short.merge(tfidf_df, left_index=True, right_index=True)
# wordcloud of the corona cluster
font_path = '/System/Library/Fonts/Supplemental/DIN Condensed Bold.ttf'
from palettable.colorbrewer.sequential import GnBu_9, Reds_8
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # coronavirus terms get a random red shade; everything else blue-green
    palette, lo, hi = (Reds_8.colors, 3, 7) if word in coronavirus_words else (GnBu_9.colors, 3, 8)
    return tuple(palette[random.randint(lo, hi)])
wc = WordCloud(font_path=font_path,
               background_color="white",
               width=1000,
               height=600,
               max_words=500,
               max_font_size=300,
               random_state=42)
plt.figure(figsize=(15, 15))
corona_text = str(df_join.loc[df_join['cluster'] == corona_cluster, 'clean_review'])
wc.generate(corona_text)
wc.recolor(color_func=color_func, random_state=3)
wc.to_file('covid_wordcloud.png')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
Now that I've identified a cluster of reviews that associate with coronavirus-like terms, I look at the sentiment of these reviews compared to others.
Next, I examined how review sentiment was associated with the coronavirus cluster compared to all clusters.
# examine range of VADER scores for the corona cluster vs. all reviews
def _print_vader_stats(frame, label):
    """Print mean/low/high of the compound, pos, and neg VADER scores of *frame*.

    FIX: replaces six copy-pasted print stanzas; also unifies the one 'High'
    value the original formatted with {:.4f} (overall neg) to the bare format
    used everywhere else, for consistent output.
    """
    print('{}:'.format(label))
    for name, col in (('Compound', 'compound'), ('Positive', 'pos'), ('Negative', 'neg')):
        print('{} VADER scores:'.format(name))
        print('Mean: {:.4f}'.format(frame[col].mean()))
        print('Low: {}; High: {}'.format(frame[col].min(), frame[col].max()))
        # the last metric of each section ends with a blank-line separator
        print('---' if col != 'neg' else '\n---')

_print_vader_stats(df_join.loc[df_join['cluster'] == corona_cluster], 'Coronavirus Cluster Statistics')
_print_vader_stats(df_join, 'Overall Statistics')
# VADER scores for the corona cluster (bottom row) v. all reviews (top row)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# FIX: seaborn >=0.12 no longer accepts the data vector as a bare positional
# argument; pass it as x= explicitly (same horizontal boxplots). The six
# duplicated plot/label stanzas are collapsed into one loop per metric.
for col_idx, (col, label) in enumerate([('compound', 'Compound'), ('pos', 'Positive'), ('neg', 'Negative')]):
    sns.boxplot(x=df_join[col], color='palevioletred', ax=axes[0, col_idx])
    axes[0, col_idx].set_xlabel('Overall {} VADER Score'.format(label))
    sns.boxplot(x=df_join.loc[df_join['cluster'] == corona_cluster, col],
                color='mediumseagreen', ax=axes[1, col_idx])
    axes[1, col_idx].set_xlabel('Coronavirus Cluster \n{} VADER Score'.format(label))
# same axes per metric - compound ranges -1 to 1; pos and neg range 0 to 1
for row in (0, 1):
    axes[row, 0].set_xlim(-1, 1)
    axes[row, 1].set_xlim(0, 1)
    axes[row, 2].set_xlim(0, 1)
plt.tight_layout()
# test for significant differences between the corona cluster and all data
# compare compound VADER score
corona_scores = df_join.loc[df_join['cluster'] == corona_cluster, 'compound']
mean_diff, conf_int, p_low, p_high = bootstrap(df_join['compound'], corona_scores)
if p_low < 0.05:
    print('The compound VADER score significantly decreased by a mean difference of {:.3f}; p={}'.format(np.abs(mean_diff), p_low))
elif p_high < 0.05:
    print('The compound VADER score significantly increased by a mean difference of {:.3f}; p={}'.format(np.abs(mean_diff), p_high))
else:
    print('No significant change in compound VADER score; p={}'.format(p_low))
# compare pos VADER score of the corona cluster against all reviews
corona_scores = df_join.loc[df_join['cluster'] == corona_cluster, 'pos']
mean_diff, conf_int, p_low, p_high = bootstrap(df_join['pos'], corona_scores)
if p_low < 0.05:
    print('The positive VADER score significantly decreased by a mean difference of {:.3f}; p={}'.format(np.abs(mean_diff), p_low))
elif p_high < 0.05:
    print('The positive VADER score significantly increased by a mean difference of {:.3f}; p={}'.format(np.abs(mean_diff), p_high))
else:
    print('No significant change in positive VADER score; p={}'.format(p_low))
# compare neg VADER score of the corona cluster against all reviews
corona_scores = df_join.loc[df_join['cluster'] == corona_cluster, 'neg']
mean_diff, conf_int, p_low, p_high = bootstrap(df_join['neg'], corona_scores)
if p_low < 0.05:
    print('The negative VADER score significantly decreased by a mean difference of {:.3f}; p={}'.format(np.abs(mean_diff), p_low))
elif p_high < 0.05:
    print('The negative VADER score significantly increased by a mean difference of {:.3f}; p={}'.format(np.abs(mean_diff), p_high))
else:
    print('No significant change in negative VADER score; p={}'.format(p_low))
In the coronavirus cluster, we see a significant decrease in positive sentiment, an increase in negative sentiment, and a decrease in compound (overall) sentiment compared to the whole group. We can confirm that VADER got it right using the scores left by reviewers (review stars).
# review-star distribution: all reviews vs. the coronavirus cluster
plt.figure(figsize=(8, 5))
overall_dist = df_join['score'].value_counts(normalize=True).sort_index()
corona_dist = df_join.loc[df_join['cluster'] == corona_cluster, 'score'].value_counts(normalize=True).sort_index()
# FIX: seaborn >=0.12 rejects two bare positional vector arguments to barplot;
# pass x= and y= explicitly (same overlaid bars)
sns.barplot(x=overall_dist.index, y=overall_dist,
            alpha=0.5, color='palevioletred', label='Overall')
sns.barplot(x=corona_dist.index, y=corona_dist,
            alpha=0.5, color='mediumseagreen', label='Coronavirus Cluster')
plt.title('Distribution of Review Stars Overall v. Coronavirus Cluster')
plt.xlabel('Review Stars')
plt.ylabel('Proportion of Stars')
plt.legend()
plt.tight_layout()
plt.savefig('corona_cluster_review_stars.jpg', dpi=500)
Here, we see an increase in the proportion of 1-star reviews in the coronavirus cluster and a decrease in 4- and 5-star reviews.
# 1-star review corona cluster wordcloud (all-red palette)
font_path = '/System/Library/Fonts/Supplemental/DIN Condensed Bold.ttf'
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # random mid-to-dark red shade for every word
    shade = random.randint(2, 7)
    return tuple(Reds_8.colors[shade])
wc = WordCloud(font_path=font_path,
               background_color="white",
               width=1000,
               height=600,
               max_words=500,
               max_font_size=300,
               random_state=42)
plt.figure(figsize=(15, 15))
one_star_text = str(df_join.loc[(df_join['cluster'] == corona_cluster) &
                                (df_join['score'] == 1), 'clean_review'])
wc.generate(one_star_text)
wc.recolor(color_func=color_func, random_state=3)
wc.to_file('1_star_covid_wordcloud.png')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# 5 star corona cluster review wordcloud (all-green palette)
font_path = '/System/Library/Fonts/Supplemental/DIN Condensed Bold.ttf'
from palettable.colorbrewer.sequential import Greens_8
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # random mid-to-dark green shade for every word
    shade = random.randint(2, 7)
    return tuple(Greens_8.colors[shade])
wc = WordCloud(font_path=font_path,
               background_color="white",
               width=1000,
               height=600,
               max_words=500,
               max_font_size=300,
               random_state=42)
plt.figure(figsize=(15, 15))
five_star_text = str(df_join.loc[(df_join['cluster'] == corona_cluster) &
                                 (df_join['score'] == 5), 'clean_review'])
wc.generate(five_star_text)
wc.recolor(color_func=color_func, random_state=3)
wc.to_file('5_star_covid_wordcloud.png')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
I compare the sentiment of the coronavirus-like cluster to all individual clusters.
# Compound VADER score by cluster (top panel) and review volume per cluster
# (bottom panel); the coronavirus-like cluster is highlighted in green.
colors = ['mediumseagreen' if i == corona_cluster else 'palevioletred' for i in range(13)]
cluster_palette = dict(zip(range(13), colors))
cluster_compound_vol = df_join.groupby('cluster').count()['compound']
fig, axs = plt.subplots(2, figsize=(15, 10), sharex=True)
sns.boxplot(x='cluster', y='compound', data=df_join,
            showfliers=False, ax=axs[0], palette=cluster_palette)
sns.barplot(x=cluster_compound_vol.index, y=cluster_compound_vol,
            ax=axs[1], ec='k', palette=cluster_palette)
axs[0].set_title('Compound VADER Scores by Cluster')
axs[0].set_xlabel('')
axs[0].set_ylabel('Compound VADER Score')
axs[1].set_xlabel('Cluster')
axs[1].set_ylabel('Number of Reviews')
# apply tight_layout BEFORE saving so the exported JPG uses the adjusted layout
plt.tight_layout()
plt.savefig('corona_cluster_compound_vader.jpg', dpi=500)
# Bootstrap each cluster's compound score against the rest; show clusters whose
# mean is significantly LOWER than the others (p-value_low < 0.05).
cluster_compound_bootstrap = bootstrap_all(df_join['cluster'].unique(), df_join, 'cluster', 'compound')
cluster_compound_bootstrap[cluster_compound_bootstrap['p-value_low'] < 0.05].sort_values('Mean', ascending=False)
We can see that the coronavirus cluster (colored in green) has a significantly lower compound VADER score than the other clusters; not only is this difference significant, but it's extreme, with a drop of ~0.24.
# Positive VADER score by cluster (top panel) and review volume per cluster
# (bottom panel); reuses cluster_palette defined for the compound-score figure.
cluster_compound_vol = df_join.groupby('cluster').count()['pos']
fig, axs = plt.subplots(2, figsize=(15, 10), sharex=True)
sns.boxplot(x='cluster', y='pos', data=df_join,
            showfliers=False, ax=axs[0], palette=cluster_palette)
sns.barplot(x=cluster_compound_vol.index, y=cluster_compound_vol,
            ax=axs[1], ec='k', palette=cluster_palette)
axs[0].set_title('Positive VADER Scores by Cluster')
axs[0].set_xlabel('')
axs[0].set_ylabel('Positive VADER Score')
axs[1].set_xlabel('Cluster')
axs[1].set_ylabel('Number of Reviews')
# apply tight_layout BEFORE saving so the exported JPG uses the adjusted layout
plt.tight_layout()
plt.savefig('corona_cluster_pos_vader.jpg', dpi=500)
# Show clusters whose mean positive score is significantly lower than the rest.
cluster_pos_bootstrap = bootstrap_all(df_join['cluster'].unique(), df_join, 'cluster', 'pos')
cluster_pos_bootstrap[cluster_pos_bootstrap['p-value_low'] < 0.05].sort_values('Mean', ascending=False)
Again, we see that the coronavirus-like cluster has a significantly lower positive VADER score than the other clusters.
# Negative VADER score by cluster (top panel) and review volume per cluster
# (bottom panel); reuses cluster_palette defined for the compound-score figure.
cluster_compound_vol = df_join.groupby('cluster').count()['neg']
fig, axs = plt.subplots(2, figsize=(15, 10), sharex=True)
sns.boxplot(x='cluster', y='neg', data=df_join,
            showfliers=False, ax=axs[0], palette=cluster_palette)
sns.barplot(x=cluster_compound_vol.index, y=cluster_compound_vol,
            ax=axs[1], ec='k', palette=cluster_palette)
axs[0].set_title('Negative VADER Scores by Cluster')
axs[0].set_xlabel('')
axs[0].set_ylabel('Negative VADER Score')
axs[1].set_xlabel('Cluster')
axs[1].set_ylabel('Number of Reviews')
plt.tight_layout()
# save the figure for consistency with the compound and positive VADER plots
plt.savefig('corona_cluster_neg_vader.jpg', dpi=500)
# Here the direction flips: show clusters whose mean NEGATIVE score is
# significantly HIGHER than the rest (p-value_high < 0.05).
cluster_neg_bootstrap = bootstrap_all(df_join['cluster'].unique(), df_join, 'cluster', 'neg')
cluster_neg_bootstrap[cluster_neg_bootstrap['p-value_high'] < 0.05].sort_values('Mean', ascending=False)
Here, we see that the coronavirus-like cluster has a significantly higher negative VADER score than the other clusters.
# 5 reviews with lowest compound scores in coronavirus cluster
# the first time i ran this, it was cluster 12, so thats where all the 12s are coming from
# remember which reviews were shown so they can be excluded later
most_negative = (df_join[df_join['cluster'] == corona_cluster]
                 .sort_values('compound')
                 .head(5))
first_neg_reviews = list(most_negative.index)
for _, review in most_negative.iterrows():
    print('\n---City: {}---'.format(review['city_x']))
    print('Restaurant Name: {}'.format(review['name']))
    print('Review Score: {}'.format(review['score']))
    print('Review Date: {}\n'.format(review['publish_date']))
    print(review['description'])
Several of the negative reviews do not explicitly mention COVID-19 and could likely be from before those restaurants switched business strategies; however, several of the others have consistent themes mentioning price, food quality, and service.
# 5 reviews with highest compound scores in the coronavirus cluster
# remember which reviews were shown so they can be excluded later
most_positive = (df_join[df_join['cluster'] == corona_cluster]
                 .sort_values('compound', ascending=False)
                 .head(5))
first_pos_reviews = list(most_positive.index)
for _, review in most_positive.iterrows():
    print('\n---City: {}---'.format(review['city_x']))
    print('Restaurant Name: {}'.format(review['name']))
    print('Review Score: {}'.format(review['score']))
    print('Review Date: {}\n'.format(review['publish_date']))
    print(review['description'])
Several of the top positive reviews appear to be from prior to lockdown. Just as an aside, I would guess review length is almost definitely associated with review sentiment - at the very least, short reviews are more likely to be neutral, but it seems the positive reviews are very long.
Here, I look again at both negative and positive reviews, but only those left since April 1, 2020. I've excluded reviews we've already seen.
# Reviews left since April 1, 2020, excluding the negative reviews already shown above.
newest_reviews = df_join[(df_join['publish_date']>'2020-04-01') & (~df_join.index.isin(first_neg_reviews))]
# 5 reviews with LOWEST compound scores in the coronavirus cluster since April 1
# (sorting ascending puts the most negative reviews first)
for idx, row in newest_reviews[(newest_reviews['cluster']==corona_cluster)].sort_values('compound').head(5).iterrows():
    print('\n---City: {}---'.format(row['city_x']))
    print('Restaurant Name: {}'.format(row['name']))
    print('Review Score: {}'.format(row['score']))
    print('Review Date: {}\n'.format(row['publish_date']))
    print(row['description'])
# Reviews left since April 1, 2020, excluding the positive reviews already shown above.
newest_reviews = df_join[(df_join['publish_date'] > '2020-04-01')
                         & (~df_join.index.isin(first_pos_reviews))]
# 5 reviews with highest compound scores in the coronavirus cluster since April 1
recent_corona = newest_reviews[newest_reviews['cluster'] == corona_cluster]
for _, review in recent_corona.sort_values('compound', ascending=False).head(5).iterrows():
    print('\n---City: {}---'.format(review['city_x']))
    print('Restaurant Name: {}'.format(review['name']))
    print('Review Score: {}'.format(review['score']))
    print('Review Date: {}\n'.format(review['publish_date']))
    print(review['description'])
To look at city by cluster, below, I show the proportions of reviews by city (normalized) overall and by the coronavirus cluster.
# Compare where reviews come from overall vs. the coronavirus cluster.
# Compute each normalized city distribution once, then overlay two translucent bar plots.
overall_cities = df_join['city_x'].value_counts(normalize=True).sort_index()
corona_cities = (df_join.loc[df_join['cluster'] == corona_cluster, 'city_x']
                 .value_counts(normalize=True).sort_index())
plt.figure(figsize=(12, 8))
# seaborn >= 0.12 requires x/y to be passed as keyword arguments
sns.barplot(x=overall_cities.index, y=overall_cities,
            alpha=0.5, color='palevioletred', label='Overall')
sns.barplot(x=corona_cities.index, y=corona_cities,
            alpha=0.5, color='mediumseagreen', label='Coronavirus Cluster')
plt.title('Distribution of Review Locations Overall v. Coronavirus Cluster')
plt.xlabel('City')
plt.ylabel('Proportion of Reviews')
plt.xticks(rotation=90)
plt.legend()
plt.tight_layout()
It appears that COVID-19-like reviews are less likely to come from Jefferson, LA (New Orleans); NYC; New Haven; and Philadelphia. I hypothesize that this decrease may be due to higher impacts of COVID-19 in particularly dense areas, especially in NYC, where eating out may have decreased as a result of the high impacts of the pandemic. Social distancing measures similarly may make it more difficult to get take-out in densely populated areas, whereas in more suburban areas, no-contact takeout and delivery could be more feasible.
Using clustering, we're able to see that coronavirus-like terms do cluster together, along with terms such as "order", "time", "delivery", "support", "takeout", "service", and even "online" - which makes sense, given the drastically changed business model restaurants across the U.S. have been forced to adopt since early March.
This cluster has significantly worse sentiment than the other clusters, which I thought initially could be due to words like "pandemic" being classified as negative words by VADER; however, I confirmed lower sentiment using review stars. Of note, this cluster is easily identified as having lower sentiment visually using a boxplot - the compound VADER score is substantially lower than that of other clusters.
Geographically, reviews that cluster with coronavirus-like terms seem to come from counties across the country, but less so from some of the regions most impacted, such as NYC.
To expand and improve this analysis, I would do the following in the future: